{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Default to GPU 1, but respect a CUDA_VISIBLE_DEVICES value that was already exported\n",
    "# before the kernel started. This runs before the cell that imports torch.\n",
    "os.environ.setdefault(\"CUDA_VISIBLE_DEVICES\", \"1\")\n",
    "\n",
    "# Make the parent directory importable (presumably where lm_encoders_hf lives).\n",
    "# The membership guard keeps re-runs of this cell from appending duplicate entries.\n",
    "if \"../\" not in sys.path:\n",
    "    sys.path.append(\"../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/yuwei/miniconda3/envs/llmembapi/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from lm_encoders_hf import MaskededLMEncoder\n",
    "from torch.nn.functional import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Embeddings will take up too much memory!\n"
     ]
    }
   ],
   "source": [
    "# Constructor arguments collected in one place so they are easy to tweak.\n",
    "# NOTE(review): mask_length is presumably the number of mask tokens the encoder fills in\n",
    "# as its answer; confirm against lm_encoders_hf.\n",
    "encoder_kwargs = {\n",
    "    \"model_name_or_path\": \"BrandonZYW/roberta-large-InBedder\",\n",
    "    \"mask_length\": 3,\n",
    "}\n",
    "model = MaskededLMEncoder(**encoder_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take the embeddings from the last layer (layer 24).\n",
    "# NOTE(review): the avg_gen prefix presumably means an average over the generated tokens;\n",
    "# confirm against lm_encoders_hf.\n",
    "output_value = \"avg_gen_layer_24\"\n",
    "model.set_output_value(output_value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['### Input:\\nI like dog\\n\\n### Instruction:\\nDo they like animals?\\n\\n### Response:', '### Input:\\nI like cat\\n\\n### Instruction:\\nDo they like animals?\\n\\n### Response:', '### Input:\\nI hate dog\\n\\n### Instruction:\\nDo they like animals?\\n\\n### Response:']\n"
     ]
    }
   ],
   "source": [
    "instruction = \"Do they like animals?\"\n",
    "# Raw sentences get their own name so corpus only ever holds the finished prompts.\n",
    "sentences = [\"I like dog\", \"I like cat\", \"I hate dog\"]\n",
    "pattern = \"### Input:\\n{input}\\n\\n### Instruction:\\n{instruction}\\n\\n### Response:\"\n",
    "\n",
    "# Fill both placeholders in one pass. Chained str.replace calls would also rewrite a\n",
    "# literal {instruction} occurring inside an input sentence; str.format never re-scans\n",
    "# the substituted values.\n",
    "corpus = [pattern.format(input=sentence, instruction=instruction) for sentence in sentences]\n",
    "print(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode every prompt in a single batch and keep the generated text for inspection.\n",
    "encode_options = dict(\n",
    "    batch_size=3,\n",
    "    cache_dir=None,  # pass a directory here when you want to reuse the embeddings\n",
    "    return_generations=True,  # also hand back the generations so they can be inspected\n",
    "    convert_to_tensor=True,\n",
    ")\n",
    "embeddings, generations = model.encode(corpus, **encode_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"###Input:I like dog.### InstructionDo they like animals###ResponseYes't dog\",\n",
       " \"###Input:I like cat.### InstructionDo they like animals###ResponseYes't animals\",\n",
       " \"###Input:I hate dog.### InstructionDo they like animals###Responsen't dog\"]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the text returned by encode() for each prompt (one string per input).\n",
    "generations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor(True)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prompt order is: 0 = I like dog, 1 = I like cat, 2 = I hate dog\n",
    "# (assuming encode() preserves input order).\n",
    "anchor = embeddings[0]\n",
    "sim1 = cosine_similarity(anchor, embeddings[1], dim=0)  # like dog vs. like cat\n",
    "sim2 = cosine_similarity(anchor, embeddings[2], dim=0)  # like dog vs. hate dog\n",
    "\n",
    "# Under the do-they-like-animals instruction, the two like sentences should be closer.\n",
    "sim1 > sim2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['### Input:\\nI like dog\\n\\n### Instruction:\\nWhat animal are they talking about?\\n\\n### Response:', '### Input:\\nI like cat\\n\\n### Instruction:\\nWhat animal are they talking about?\\n\\n### Response:', '### Input:\\nI hate dog\\n\\n### Instruction:\\nWhat animal are they talking about?\\n\\n### Response:']\n"
     ]
    }
   ],
   "source": [
    "instruction = \"What animal are they talking about?\"\n",
    "# Raw sentences get their own name so corpus only ever holds the finished prompts.\n",
    "sentences = [\"I like dog\", \"I like cat\", \"I hate dog\"]\n",
    "pattern = \"### Input:\\n{input}\\n\\n### Instruction:\\n{instruction}\\n\\n### Response:\"\n",
    "\n",
    "# Fill both placeholders in one pass. Chained str.replace calls would also rewrite a\n",
    "# literal {instruction} occurring inside an input sentence; str.format never re-scans\n",
    "# the substituted values.\n",
    "corpus = [pattern.format(input=sentence, instruction=instruction) for sentence in sentences]\n",
    "print(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode every prompt in a single batch and keep the generated text for inspection.\n",
    "encode_options = dict(\n",
    "    batch_size=3,\n",
    "    cache_dir=None,  # pass a directory here when you want to reuse the embeddings\n",
    "    return_generations=True,  # also hand back the generations so they can be inspected\n",
    "    convert_to_tensor=True,\n",
    ")\n",
    "embeddings, generations = model.encode(corpus, **encode_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['###Input:\"I like dog.### InstructionWhat animal are they talking about-###Response-dogoy dog',\n",
       " '###Input:\"I like cat.### InstructionWhat animal are they talking about-###Response-citt cat',\n",
       " '###Input:\"I hate dog.### InstructionWhat animal are they talking about-###Response-poy dog']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the text returned by encode() for each prompt (one string per input).\n",
    "generations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor(True)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prompt order is: 0 = I like dog, 1 = I like cat, 2 = I hate dog\n",
    "# (assuming encode() preserves input order).\n",
    "anchor = embeddings[0]\n",
    "sim1 = cosine_similarity(anchor, embeddings[1], dim=0)  # like dog vs. like cat\n",
    "sim2 = cosine_similarity(anchor, embeddings[2], dim=0)  # like dog vs. hate dog\n",
    "\n",
    "# Under the what-animal instruction, the two dog sentences should be closer.\n",
    "sim1 < sim2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llmembapi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
